## 33 DNA sequences in binary format stored in a list.
##
## Mean sequence length: 1982.879
## Shortest sequence: 931
## Longest sequence: 2920
##
## Labels:
## arti_JF806202
## arti_HM161150
## arti_FJ356743
## arti_JF806205
## arti_JQ073190
## arti_GU457971
## ...
##
## Base composition:
## a c g t
## 0.312 0.205 0.233 0.250
## (Total: 65.44 kb)
Base compositions of simulated data are approximately the same as original data, which is exactly what we expect.
## 33 DNA sequences in binary format stored in a matrix.
##
## All sequences of same length: 1983
##
## Labels:
## arti_JF806202
## arti_HM161150
## arti_FJ356743
## arti_JF806205
## arti_JQ073190
## arti_GU457971
## ...
##
## Base composition:
## a c g t
## 0.310 0.205 0.234 0.252
## (Total: 65.44 kb)
We take the second simulated sequences from phylogenetic tree as example, refer to the details as follows. And there are no stop codons detected in neither sequences.
## [1] "Here come the brief about simulated sequences:"
## For sequence 1 :
## *Base composition:
## a c g t
## 0.3121533 0.2208775 0.2329803 0.2339889
## *GC content:
## [1] 0.4538578
## *AT content:
## [1] 0.5461422
## *number of stop codons:
## [1] 0
##
## For sequence 2 :
## *Base composition:
## a c g t
## 0.3187090 0.1835603 0.2491175 0.2486132
## *GC content:
## [1] 0.4326778
## *AT content:
## [1] 0.5673222
## *number of stop codons:
## [1] 0
##
## For sequence 3 :
## *Base composition:
## a c g t
## 0.2904690 0.2042360 0.2410489 0.2642461
## *GC content:
## [1] 0.4452849
## *AT content:
## [1] 0.5547151
## *number of stop codons:
## [1] 0
##
## For sequence 4 :
## *Base composition:
## a c g t
## 0.3030761 0.2244075 0.2299546 0.2425618
## *GC content:
## [1] 0.4543621
## *AT content:
## [1] 0.5456379
## *number of stop codons:
## [1] 0
##
## For sequence 5 :
## *Base composition:
## a c g t
## 0.3010590 0.2168432 0.2344932 0.2476046
## *GC content:
## [1] 0.4513364
## *AT content:
## [1] 0.5486636
## *number of stop codons:
## [1] 0
##
## For sequence 6 :
## *Base composition:
## a c g t
## 0.3111447 0.2148260 0.2279375 0.2460918
## *GC content:
## [1] 0.4427635
## *AT content:
## [1] 0.5572365
## *number of stop codons:
## [1] 0
##
## For sequence 7 :
## *Base composition:
## a c g t
## 0.3071104 0.2178517 0.2334846 0.2415532
## *GC content:
## [1] 0.4513364
## *AT content:
## [1] 0.5486636
## *number of stop codons:
## [1] 0
##
## For sequence 8 :
## *Base composition:
## a c g t
## 0.3055976 0.2128089 0.2430661 0.2385275
## *GC content:
## [1] 0.4558749
## *AT content:
## [1] 0.5441251
## *number of stop codons:
## [1] 0
##
## For sequence 9 :
## *Base composition:
## a c g t
## 0.3101362 0.1976803 0.2390318 0.2531518
## *GC content:
## [1] 0.4367121
## *AT content:
## [1] 0.5632879
## *number of stop codons:
## [1] 0
##
## For sequence 10 :
## *Base composition:
## a c g t
## 0.3192133 0.2047403 0.2274332 0.2486132
## *GC content:
## [1] 0.4321735
## *AT content:
## [1] 0.5678265
## *number of stop codons:
## [1] 0
##
## For sequence 11 :
## *Base composition:
## a c g t
## 0.2995461 0.2153303 0.2334846 0.2516389
## *GC content:
## [1] 0.4488149
## *AT content:
## [1] 0.5511851
## *number of stop codons:
## [1] 0
##
## For sequence 12 :
## *Base composition:
## a c g t
## 0.3151790 0.1946546 0.2349975 0.2551689
## *GC content:
## [1] 0.429652
## *AT content:
## [1] 0.570348
## *number of stop codons:
## [1] 0
##
## For sequence 13 :
## *Base composition:
## a c g t
## 0.2980333 0.2007060 0.2339889 0.2672718
## *GC content:
## [1] 0.4346949
## *AT content:
## [1] 0.5653051
## *number of stop codons:
## [1] 0
##
## For sequence 14 :
## *Base composition:
## a c g t
## 0.2995461 0.2007060 0.2269289 0.2728190
## *GC content:
## [1] 0.4276349
## *AT content:
## [1] 0.5723651
## *number of stop codons:
## [1] 0
##
## For sequence 15 :
## *Base composition:
## a c g t
## 0.3116490 0.1951589 0.2334846 0.2597075
## *GC content:
## [1] 0.4286435
## *AT content:
## [1] 0.5713565
## *number of stop codons:
## [1] 0
##
## For sequence 16 :
## *Base composition:
## a c g t
## 0.3166919 0.2067574 0.2178517 0.2586989
## *GC content:
## [1] 0.4246092
## *AT content:
## [1] 0.5753908
## *number of stop codons:
## [1] 0
##
## For sequence 17 :
## *Base composition:
## a c g t
## 0.3045890 0.2052446 0.2405446 0.2496218
## *GC content:
## [1] 0.4457892
## *AT content:
## [1] 0.5542108
## *number of stop codons:
## [1] 0
##
## For sequence 18 :
## *Base composition:
## a c g t
## 0.3262733 0.1981846 0.2284418 0.2471004
## *GC content:
## [1] 0.4266263
## *AT content:
## [1] 0.5733737
## *number of stop codons:
## [1] 0
##
## For sequence 19 :
## *Base composition:
## a c g t
## 0.3071104 0.2173475 0.2304589 0.2450832
## *GC content:
## [1] 0.4478064
## *AT content:
## [1] 0.5521936
## *number of stop codons:
## [1] 0
##
## For sequence 20 :
## *Base composition:
## a c g t
## 0.3131619 0.2032274 0.2304589 0.2531518
## *GC content:
## [1] 0.4336863
## *AT content:
## [1] 0.5663137
## *number of stop codons:
## [1] 0
##
## For sequence 21 :
## *Base composition:
## a c g t
## 0.3116490 0.1996974 0.2360061 0.2526475
## *GC content:
## [1] 0.4357035
## *AT content:
## [1] 0.5642965
## *number of stop codons:
## [1] 0
##
## For sequence 22 :
## *Base composition:
## a c g t
## 0.3015633 0.2163389 0.2344932 0.2476046
## *GC content:
## [1] 0.4508321
## *AT content:
## [1] 0.5491679
## *number of stop codons:
## [1] 0
##
## For sequence 23 :
## *Base composition:
## a c g t
## 0.3136662 0.2067574 0.2334846 0.2460918
## *GC content:
## [1] 0.4402421
## *AT content:
## [1] 0.5597579
## *number of stop codons:
## [1] 0
##
## For sequence 24 :
## *Base composition:
## a c g t
## 0.3187090 0.1855774 0.2329803 0.2627332
## *GC content:
## [1] 0.4185577
## *AT content:
## [1] 0.5814423
## *number of stop codons:
## [1] 0
##
## For sequence 25 :
## *Base composition:
## a c g t
## 0.3247605 0.1996974 0.2319718 0.2435703
## *GC content:
## [1] 0.4316692
## *AT content:
## [1] 0.5683308
## *number of stop codons:
## [1] 0
##
## For sequence 26 :
## *Base composition:
## a c g t
## 0.3126576 0.1875946 0.2385275 0.2612204
## *GC content:
## [1] 0.426122
## *AT content:
## [1] 0.573878
## *number of stop codons:
## [1] 0
##
## For sequence 27 :
## *Base composition:
## a c g t
## 0.3116490 0.2002017 0.2324760 0.2556732
## *GC content:
## [1] 0.4326778
## *AT content:
## [1] 0.5673222
## *number of stop codons:
## [1] 0
##
## For sequence 28 :
## *Base composition:
## a c g t
## 0.3061019 0.2143217 0.2294503 0.2501261
## *GC content:
## [1] 0.4437721
## *AT content:
## [1] 0.5562279
## *number of stop codons:
## [1] 0
##
## For sequence 29 :
## *Base composition:
## a c g t
## 0.3055976 0.2077660 0.2314675 0.2551689
## *GC content:
## [1] 0.4392335
## *AT content:
## [1] 0.5607665
## *number of stop codons:
## [1] 0
##
## For sequence 30 :
## *Base composition:
## a c g t
## 0.3106404 0.2128089 0.2223903 0.2541604
## *GC content:
## [1] 0.4351992
## *AT content:
## [1] 0.5648008
## *number of stop codons:
## [1] 0
##
## For sequence 31 :
## *Base composition:
## a c g t
## 0.3071104 0.1996974 0.2481089 0.2450832
## *GC content:
## [1] 0.4478064
## *AT content:
## [1] 0.5521936
## *number of stop codons:
## [1] 0
##
## For sequence 32 :
## *Base composition:
## a c g t
## 0.3282905 0.2037317 0.2309632 0.2370146
## *GC content:
## [1] 0.4346949
## *AT content:
## [1] 0.5653051
## *number of stop codons:
## [1] 0
##
## For sequence 33 :
## *Base composition:
## a c g t
## 0.2990419 0.1966717 0.2344932 0.2697932
## *GC content:
## [1] 0.4311649
## *AT content:
## [1] 0.5688351
## *number of stop codons:
## [1] 0
## [1] "Here come the amino composition of simulated proteins:"
## For sequence 1 :
## *Amino acid composition:
## [1] "*" "A" "C" "D" "E" "F" "G" "H" "I" "K" "L" "M" "N" "P" "Q" "R" "S" "T" "V"
## [20] "W" "Y"
##
## [1] 0.05 0.03 0.02 0.02 0.04 0.02 0.07 0.04 0.06 0.05 0.08 0.01 0.05 0.06 0.05
## [16] 0.10 0.08 0.06 0.05 0.02 0.03
##
## For sequence 2 :
## *Amino acid composition:
## [1] "*" "A" "C" "D" "E" "F" "G" "H" "I" "K" "L" "M" "N" "P" "Q" "R" "S" "T" "V"
## [20] "W" "Y"
##
## [1] 0.06 0.04 0.03 0.03 0.03 0.03 0.06 0.02 0.08 0.07 0.08 0.01 0.04 0.02 0.04
## [16] 0.11 0.10 0.05 0.07 0.01 0.03
##
## For sequence 3 :
## *Amino acid composition:
## [1] "*" "A" "C" "D" "E" "F" "G" "H" "I" "K" "L" "M" "N" "P" "Q" "R" "S" "T" "V"
## [20] "W" "Y"
##
## [1] 0.05 0.04 0.03 0.03 0.04 0.03 0.06 0.03 0.05 0.05 0.11 0.02 0.04 0.06 0.04
## [16] 0.08 0.09 0.06 0.06 0.02 0.02
##
## For sequence 4 :
## *Amino acid composition:
## [1] "*" "A" "C" "D" "E" "F" "G" "H" "I" "K" "L" "M" "N" "P" "Q" "R" "S" "T" "V"
## [20] "W" "Y"
##
## [1] 0.06 0.06 0.02 0.03 0.03 0.02 0.06 0.04 0.05 0.04 0.09 0.02 0.05 0.05 0.03
## [16] 0.09 0.10 0.06 0.05 0.01 0.04
##
## For sequence 5 :
## *Amino acid composition:
## [1] "*" "A" "C" "D" "E" "F" "G" "H" "I" "K" "L" "M" "N" "P" "Q" "R" "S" "T" "V"
## [20] "W" "Y"
##
## [1] 0.05 0.04 0.03 0.05 0.03 0.03 0.04 0.02 0.04 0.06 0.09 0.02 0.04 0.05 0.04
## [16] 0.09 0.10 0.08 0.06 0.02 0.03
##
## For sequence 6 :
## *Amino acid composition:
## [1] "*" "A" "C" "D" "E" "F" "G" "H" "I" "K" "L" "M" "N" "P" "Q" "R" "S" "T" "V"
## [20] "W" "Y"
##
## [1] 0.06 0.04 0.02 0.03 0.03 0.02 0.06 0.02 0.07 0.03 0.10 0.02 0.05 0.05 0.04
## [16] 0.08 0.10 0.07 0.05 0.01 0.03
##
## For sequence 7 :
## *Amino acid composition:
## [1] "*" "A" "C" "D" "E" "F" "G" "H" "I" "K" "L" "M" "N" "P" "Q" "R" "S" "T" "V"
## [20] "W" "Y"
##
## [1] 0.07 0.06 0.02 0.04 0.03 0.03 0.05 0.02 0.06 0.05 0.09 0.02 0.05 0.06 0.04
## [16] 0.08 0.09 0.05 0.06 0.02 0.02
##
## For sequence 8 :
## *Amino acid composition:
## [1] "*" "A" "C" "D" "E" "F" "G" "H" "I" "K" "L" "M" "N" "P" "Q" "R" "S" "T" "V"
## [20] "W" "Y"
##
## [1] 0.07 0.04 0.03 0.03 0.04 0.02 0.06 0.04 0.07 0.04 0.09 0.02 0.04 0.04 0.03
## [16] 0.10 0.08 0.06 0.07 0.02 0.03
##
## For sequence 9 :
## *Amino acid composition:
## [1] "*" "A" "C" "D" "E" "F" "G" "H" "I" "K" "L" "M" "N" "P" "Q" "R" "S" "T" "V"
## [20] "W" "Y"
##
## [1] 0.07 0.03 0.02 0.03 0.03 0.03 0.06 0.03 0.08 0.07 0.08 0.01 0.03 0.04 0.03
## [16] 0.09 0.10 0.05 0.07 0.01 0.04
##
## For sequence 10 :
## *Amino acid composition:
## [1] "*" "A" "C" "D" "E" "F" "G" "H" "I" "K" "L" "M" "N" "P" "Q" "R" "S" "T" "V"
## [20] "W" "Y"
##
## [1] 0.05 0.07 0.03 0.02 0.03 0.03 0.06 0.04 0.07 0.07 0.09 0.02 0.04 0.03 0.03
## [16] 0.09 0.07 0.07 0.04 0.00 0.04
##
## For sequence 11 :
## *Amino acid composition:
## [1] "*" "A" "C" "D" "E" "F" "G" "H" "I" "K" "L" "M" "N" "P" "Q" "R" "S" "T" "V"
## [20] "W" "Y"
##
## [1] 0.04 0.06 0.03 0.04 0.04 0.02 0.05 0.02 0.05 0.05 0.10 0.02 0.04 0.04 0.04
## [16] 0.09 0.09 0.07 0.05 0.01 0.04
##
## For sequence 12 :
## *Amino acid composition:
## [1] "*" "A" "C" "D" "E" "F" "G" "H" "I" "K" "L" "M" "N" "P" "Q" "R" "S" "T" "V"
## [20] "W" "Y"
##
## [1] 0.07 0.05 0.03 0.03 0.04 0.03 0.07 0.02 0.05 0.05 0.08 0.02 0.04 0.04 0.03
## [16] 0.07 0.10 0.07 0.05 0.01 0.05
##
## For sequence 13 :
## *Amino acid composition:
## [1] "*" "A" "C" "D" "E" "F" "G" "H" "I" "K" "L" "M" "N" "P" "Q" "R" "S" "T" "V"
## [20] "W" "Y"
##
## [1] 0.07 0.07 0.03 0.04 0.02 0.04 0.05 0.02 0.05 0.06 0.08 0.02 0.04 0.05 0.03
## [16] 0.08 0.09 0.07 0.06 0.01 0.03
##
## For sequence 14 :
## *Amino acid composition:
## [1] "*" "A" "C" "D" "E" "F" "G" "H" "I" "K" "L" "M" "N" "P" "Q" "R" "S" "T" "V"
## [20] "W" "Y"
##
## [1] 0.06 0.03 0.01 0.02 0.04 0.03 0.06 0.04 0.07 0.05 0.11 0.02 0.03 0.04 0.04
## [16] 0.10 0.08 0.07 0.07 0.01 0.03
##
## For sequence 15 :
## *Amino acid composition:
## [1] "*" "A" "C" "D" "E" "F" "G" "H" "I" "K" "L" "M" "N" "P" "Q" "R" "S" "T" "V"
## [20] "W" "Y"
##
## [1] 0.07 0.03 0.03 0.04 0.04 0.03 0.06 0.03 0.05 0.06 0.10 0.01 0.05 0.04 0.03
## [16] 0.07 0.11 0.06 0.05 0.01 0.03
##
## For sequence 16 :
## *Amino acid composition:
## [1] "*" "A" "C" "D" "E" "F" "G" "H" "I" "K" "L" "M" "N" "P" "Q" "R" "S" "T" "V"
## [20] "W" "Y"
##
## [1] 0.07 0.06 0.03 0.02 0.03 0.03 0.05 0.04 0.06 0.06 0.09 0.02 0.04 0.03 0.03
## [16] 0.10 0.08 0.06 0.05 0.00 0.04
##
## For sequence 17 :
## *Amino acid composition:
## [1] "*" "A" "C" "D" "E" "F" "G" "H" "I" "K" "L" "M" "N" "P" "Q" "R" "S" "T" "V"
## [20] "W" "Y"
##
## [1] 0.07 0.05 0.03 0.03 0.04 0.03 0.05 0.02 0.05 0.05 0.08 0.01 0.04 0.05 0.04
## [16] 0.09 0.09 0.07 0.06 0.02 0.03
##
## For sequence 18 :
## *Amino acid composition:
## [1] "*" "A" "C" "D" "E" "F" "G" "H" "I" "K" "L" "M" "N" "P" "Q" "R" "S" "T" "V"
## [20] "W" "Y"
##
## [1] 0.05 0.05 0.03 0.04 0.03 0.03 0.06 0.02 0.07 0.05 0.07 0.02 0.05 0.03 0.04
## [16] 0.09 0.11 0.06 0.05 0.01 0.05
##
## For sequence 19 :
## *Amino acid composition:
## [1] "*" "A" "C" "D" "E" "F" "G" "H" "I" "K" "L" "M" "N" "P" "Q" "R" "S" "T" "V"
## [20] "W" "Y"
##
## [1] 0.06 0.06 0.02 0.04 0.03 0.04 0.05 0.03 0.05 0.06 0.09 0.02 0.04 0.05 0.04
## [16] 0.08 0.10 0.06 0.05 0.01 0.03
##
## For sequence 20 :
## *Amino acid composition:
## [1] "*" "A" "C" "D" "E" "F" "G" "H" "I" "K" "L" "M" "N" "P" "Q" "R" "S" "T" "V"
## [20] "W" "Y"
##
## [1] 0.06 0.05 0.02 0.03 0.03 0.03 0.05 0.02 0.06 0.06 0.08 0.02 0.05 0.05 0.02
## [16] 0.10 0.10 0.06 0.06 0.02 0.04
##
## For sequence 21 :
## *Amino acid composition:
## [1] "*" "A" "C" "D" "E" "F" "G" "H" "I" "K" "L" "M" "N" "P" "Q" "R" "S" "T" "V"
## [20] "W" "Y"
##
## [1] 0.06 0.05 0.04 0.04 0.04 0.02 0.05 0.03 0.06 0.05 0.07 0.03 0.05 0.05 0.04
## [16] 0.11 0.08 0.05 0.06 0.02 0.03
##
## For sequence 22 :
## *Amino acid composition:
## [1] "*" "A" "C" "D" "E" "F" "G" "H" "I" "K" "L" "M" "N" "P" "Q" "R" "S" "T" "V"
## [20] "W" "Y"
##
## [1] 0.06 0.05 0.03 0.04 0.03 0.03 0.04 0.02 0.05 0.05 0.09 0.02 0.04 0.04 0.04
## [16] 0.09 0.09 0.08 0.05 0.02 0.03
##
## For sequence 23 :
## *Amino acid composition:
## [1] "*" "A" "C" "D" "E" "F" "G" "H" "I" "K" "L" "M" "N" "P" "Q" "R" "S" "T" "V"
## [20] "W" "Y"
##
## [1] 0.05 0.05 0.03 0.04 0.04 0.03 0.07 0.03 0.07 0.05 0.09 0.02 0.05 0.04 0.04
## [16] 0.07 0.09 0.06 0.05 0.01 0.04
##
## For sequence 24 :
## *Amino acid composition:
## [1] "*" "A" "C" "D" "E" "F" "G" "H" "I" "K" "L" "M" "N" "P" "Q" "R" "S" "T" "V"
## [20] "W" "Y"
##
## [1] 0.07 0.03 0.03 0.04 0.03 0.04 0.06 0.03 0.05 0.06 0.09 0.01 0.05 0.03 0.03
## [16] 0.10 0.09 0.06 0.05 0.01 0.03
##
## For sequence 25 :
## *Amino acid composition:
## [1] "*" "A" "C" "D" "E" "F" "G" "H" "I" "K" "L" "M" "N" "P" "Q" "R" "S" "T" "V"
## [20] "W" "Y"
##
## [1] 0.07 0.05 0.03 0.03 0.03 0.02 0.06 0.04 0.06 0.05 0.07 0.03 0.05 0.03 0.03
## [16] 0.08 0.10 0.07 0.06 0.02 0.04
##
## For sequence 26 :
## *Amino acid composition:
## [1] "*" "A" "C" "D" "E" "F" "G" "H" "I" "K" "L" "M" "N" "P" "Q" "R" "S" "T" "V"
## [20] "W" "Y"
##
## [1] 0.05 0.05 0.02 0.03 0.05 0.03 0.06 0.04 0.05 0.05 0.08 0.02 0.04 0.04 0.04
## [16] 0.08 0.09 0.06 0.04 0.02 0.05
##
## For sequence 27 :
## *Amino acid composition:
## [1] "*" "A" "C" "D" "E" "F" "G" "H" "I" "K" "L" "M" "N" "P" "Q" "R" "S" "T" "V"
## [20] "W" "Y"
##
## [1] 0.07 0.04 0.02 0.03 0.03 0.02 0.05 0.03 0.07 0.04 0.09 0.02 0.04 0.03 0.03
## [16] 0.08 0.10 0.06 0.07 0.01 0.04
##
## For sequence 28 :
## *Amino acid composition:
## [1] "*" "A" "C" "D" "E" "F" "G" "H" "I" "K" "L" "M" "N" "P" "Q" "R" "S" "T" "V"
## [20] "W" "Y"
##
## [1] 0.05 0.04 0.02 0.04 0.04 0.04 0.06 0.03 0.06 0.05 0.10 0.02 0.05 0.04 0.04
## [16] 0.07 0.09 0.06 0.05 0.01 0.04
##
## For sequence 29 :
## *Amino acid composition:
## [1] "*" "A" "C" "D" "E" "F" "G" "H" "I" "K" "L" "M" "N" "P" "Q" "R" "S" "T" "V"
## [20] "W" "Y"
##
## [1] 0.08 0.05 0.03 0.03 0.04 0.03 0.05 0.02 0.07 0.03 0.09 0.02 0.04 0.04 0.03
## [16] 0.08 0.09 0.07 0.05 0.01 0.04
##
## For sequence 30 :
## *Amino acid composition:
## [1] "*" "A" "C" "D" "E" "F" "G" "H" "I" "K" "L" "M" "N" "P" "Q" "R" "S" "T" "V"
## [20] "W" "Y"
##
## [1] 0.05 0.04 0.03 0.03 0.04 0.03 0.05 0.03 0.05 0.05 0.10 0.02 0.05 0.04 0.04
## [16] 0.09 0.08 0.07 0.06 0.01 0.05
##
## For sequence 31 :
## *Amino acid composition:
## [1] "*" "A" "C" "D" "E" "F" "G" "H" "I" "K" "L" "M" "N" "P" "Q" "R" "S" "T" "V"
## [20] "W" "Y"
##
## [1] 0.08 0.07 0.03 0.03 0.04 0.03 0.06 0.02 0.06 0.04 0.08 0.02 0.04 0.04 0.04
## [16] 0.10 0.08 0.05 0.06 0.00 0.04
##
## For sequence 32 :
## *Amino acid composition:
## [1] "*" "A" "C" "D" "E" "F" "G" "H" "I" "K" "L" "M" "N" "P" "Q" "R" "S" "T" "V"
## [20] "W" "Y"
##
## [1] 0.06 0.03 0.02 0.03 0.05 0.02 0.06 0.03 0.07 0.05 0.10 0.02 0.05 0.03 0.04
## [16] 0.07 0.09 0.08 0.06 0.01 0.03
##
## For sequence 33 :
## *Amino acid composition:
## [1] "*" "A" "C" "D" "E" "F" "G" "H" "I" "K" "L" "M" "N" "P" "Q" "R" "S" "T" "V"
## [20] "W" "Y"
##
## [1] 0.06 0.05 0.02 0.03 0.03 0.03 0.05 0.02 0.06 0.06 0.10 0.02 0.04 0.04 0.04
## [16] 0.08 0.10 0.05 0.06 0.02 0.05
## [1] "Here come the number of stop codons in true sequences:"
## [1] 0
Fitted model from simulations present approximately the same pattern as in the true base compositions, i.e. for every state its transition state follows uniform probability of (0.31,0.2,0.23,0.25). While the model from true model present more varying patterns among states. Besides, model fitting violates the assumption that stationary transition pattern persists and current state depends only on its previous state, since when the sequences are concatenated together the stationary probability model has been infringed.
## [1] "Here comes the fitted Markov-Chain model for the first simulated sequences:"
## MLE Fit
## A 4 - dimensional discrete Markov Chain defined by the following states:
## a, c, g, t
## The transition matrix (by rows) is defined as follows:
## a c g t
## a 0.3100863 0.2027544 0.2338267 0.2533327
## c 0.3157501 0.2021817 0.2296025 0.2524656
## g 0.3102613 0.2088927 0.2346932 0.2461528
## t 0.3122443 0.2046412 0.2346870 0.2484275
## [1] "Here comes the fitted Markov-Chain model for the second simulated sequences:"
## MLE Fit
## A 4 - dimensional discrete Markov Chain defined by the following states:
## a, c, g, t
## The transition matrix (by rows) is defined as follows:
## a c g t
## a 0.3055254 0.2034534 0.2355698 0.2554514
## c 0.3108743 0.2064545 0.2293359 0.2533353
## g 0.3129376 0.2048950 0.2348668 0.2473006
## t 0.3111111 0.2059502 0.2330905 0.2498482
## [1] "Here comes the fitted Markov-Chain model for the true sequences:"
## MLE Fit
## A 4 - dimensional discrete Markov Chain defined by the following states:
## a, c, g, t
## The transition matrix (by rows) is defined as follows:
## a c g t
## a 0.3378372 0.1732511 0.27493024 0.2139815
## c 0.3794979 0.2477837 0.05036132 0.3223571
## g 0.3936403 0.2031136 0.19330904 0.2099371
## t 0.1510704 0.2117775 0.35690460 0.2802474
The heatmaps show that simulations present only high similarity between the same sequences, i.e. all simulated sequences are hardly similar to each other, which can be explained by the random sampling. While the true sequences show significant similarity between some, e.g. JF806204/JN112661, GU457971/FJ356741.
## use default substitution matrix
## use default substitution matrix
## use default substitution matrix
The fitted trees based on 2 simulations are significantly different from the true one.
##
Running bootstraps: 100 / 1000
Running bootstraps: 200 / 1000
Running bootstraps: 300 / 1000
Running bootstraps: 400 / 1000
Running bootstraps: 500 / 1000
Running bootstraps: 600 / 1000
Running bootstraps: 700 / 1000
Running bootstraps: 800 / 1000
Running bootstraps: 900 / 1000
Running bootstraps: 1000 / 1000
## Calculating bootstrap values... done.
##
Running bootstraps: 100 / 1000
Running bootstraps: 200 / 1000
Running bootstraps: 300 / 1000
Running bootstraps: 400 / 1000
Running bootstraps: 500 / 1000
Running bootstraps: 600 / 1000
Running bootstraps: 700 / 1000
Running bootstraps: 800 / 1000
Running bootstraps: 900 / 1000
Running bootstraps: 1000 / 1000
## Calculating bootstrap values... done.
##
Running bootstraps: 100 / 1000
Running bootstraps: 200 / 1000
Running bootstraps: 300 / 1000
Running bootstraps: 400 / 1000
Running bootstraps: 500 / 1000
Running bootstraps: 600 / 1000
Running bootstraps: 700 / 1000
Running bootstraps: 800 / 1000
Running bootstraps: 900 / 1000
Running bootstraps: 1000 / 1000
## Calculating bootstrap values... done.
Values from bootstrap analysis are useful in this case, the larger the better, a higher value denotes inferred trees are more likely to repeat the original sequence pattern. Inferred tree for first simulation has mean bootstrap analysis value of 43.8, and second simulation of 46, and true sequence of 44.4.
Total Cophenetic Index measures the imbalance of any phylogenetic tree, the lower the better, a lower value denotes a perfectly resolved tree. In this case, simulation 1 has index of 606, simulation of 792 and true sequence of 612.
knitr::opts_chunk$set(eval=TRUE,echo=F,
message = F, warning = F, error = F,
fig.width=7, fig.height=5, fig.align="center")
library(ape)
library(seqinr)
library(phangorn)
library(markovchain)
library(msa)
library(seriation)
library(plotly)
library(TotalCopheneticIndex)
### part 1.1
dna<-read.dna("C:/Users/A550240/Desktop/LIU/3.2_BioInformatics/Lab2_lizard_seqs.fasta",
format="fasta")
names<-names(dna)
len_seq<-NULL
arti_names<-NULL
set.seed(1234)
sim<-function(file){
arti_sequences<-NULL
for(i in 1:33){
name<-names[i]
len<-length(dna[[name]])
len_seq<<-c(len_seq,len)
arti_seq<-paste("arti_",name,sep="")
arti_names<<-c(arti_names,arti_seq)
assign(arti_seq,NULL)
for(j in 1:len)
assign(arti_seq,c(eval(parse(text=arti_seq)),sample(c('a','c','g','t'),1,prob=c(0.312,0.205,0.231,0.252))))
if(i==1)
write.fasta(eval(parse(text=arti_seq)),eval(arti_seq), file=paste("C:/Users/A550240/Desktop/LIU/3.2_BioInformatics/",file,sep=""),open="w") else
write.fasta(eval(parse(text=arti_seq)),eval(arti_seq), file=paste("C:/Users/A550240/Desktop/LIU/3.2_BioInformatics/",file,sep=""),open="a")
}}
sim("Lab2_arti1_seqs.fasta")
s<- read.dna("C:/Users/A550240/Desktop/LIU/3.2_BioInformatics/Lab2_arti1_seqs.fasta",format="fasta")
print(s)
### part 1.2
set.seed(1234)
tr<-rphylo(33,1,0) ## rlineage(1, 0, Tmax=2.5) ##rtree(33)
tr$tip.label<-arti_names
plot(tr)
sim<-simSeq(tr, l= 1983, type="DNA", bf=c(0.312,0.205,0.231,0.252),
Q=c(0.312,0.312,0.312,0.231,0.231,0.205))
write.dna(as.DNAbin(sim),
file ="C:/Users/A550240/Desktop/LIU/3.2_BioInformatics/Lab2_arti2_seqs.fasta",
format = "fasta", append =FALSE, nbcol = 6, colsep = " ", colw = 10)
s<- read.dna("C:/Users/A550240/Desktop/LIU/3.2_BioInformatics/Lab2_arti2_seqs.fasta",format="fasta")
print(s)
lizards_accession_numbers <- c("JF806202", "HM161150", "FJ356743", "JF806205",
"JQ073190", "GU457971", "FJ356741", "JF806207",
"JF806210", "AY662592", "AY662591", "FJ356748",
"JN112660", "AY662594", "JN112661", "HQ876437",
"HQ876434", "AY662590", "FJ356740", "JF806214",
"JQ073188", "FJ356749", "JQ073189", "JF806216",
"AY662598", "JN112653", "JF806204", "FJ356747",
"FJ356744", "HQ876440", "JN112651", "JF806215",
"JF806209")
true<-ape::read.GenBank(lizards_accession_numbers)
simu<-read.dna("C:/Users/A550240/Desktop/LIU/3.2_BioInformatics/Lab2_arti2_seqs.fasta",format="fasta")
GCcon<-function(v){ ## v, character vector of acgt
n<-length(v)
count<-0
for(i in 1:n) if(v[i]=="c" | v[i]=="g") count<-count+1
return(count/n)
}
num2nucleo<-function(v){# convert DNAbin from (88,28,48,18) to (a,c,g,t)
n<-length(v) ## v, character vector of (88,28,48,18)
for(i in 1:n) {
if(v[i]=="88") v[i]="a"
if(v[i]=="28") v[i]="c"
if(v[i]=="48") v[i]="g"
if(v[i]=="18" | v[i]=="30") v[i]="t"
if(v[i]=="c0") v[i]=sample(c("a","g"),1,prob=c(0.5,0.5)) ## R
if(v[i]=="a0") v[i]=sample(c("a","c"),1,prob=c(0.5,0.5)) ## M
if(v[i]=="60") v[i]=sample(c("g","c"),1,prob=c(0.5,0.5)) ## S
}
return(v)}
num2prot<-function(v){# convert DNAbin from labels to protein codes
n<-length(v) ## v, character vector of labels
for(i in 1:n) {
if(v[i]=="41") v[i]="A"
if(v[i]=="57") v[i]="W"
if(v[i]=="49") v[i]="I"
if(v[i]=="54") v[i]="T"
if(v[i]=="47") v[i]="G"
if(v[i]=="53") v[i]="S"
if(v[i]=="51") v[i]="Q"
if(v[i]=="4b") v[i]="K"
if(v[i]=="4d") v[i]="M"
if(v[i]=="46") v[i]="F"
if(v[i]=="4c") v[i]="L"
if(v[i]=="43") v[i]="C"
if(v[i]=="50") v[i]="P"
if(v[i]=="2a") v[i]="*"
if(v[i]=="44") v[i]="D"
if(v[i]=="45") v[i]="E"
if(v[i]=="48") v[i]="H"
if(v[i]=="4e") v[i]="N"
if(v[i]=="52") v[i]="R"
if(v[i]=="56") v[i]="V"
if(v[i]=="59") v[i]="Y"}
return(v)}
nstop<-function(v){ ## v, character vector of acgt
n<-length(v)
count<-0
for(i in 1:(n-2)) {
temp=paste(v[i:i+2],collapse="")
if(temp %in% c("taa","tag","tga")) count<-count+1}
return(count)
}
print("Here come the brief about simulated sequences:")
for(i in 1:33) {
seq<-as.character(as.list(simu[i,])[[arti_names[i]]])
nuc<-num2nucleo(seq)
cat("For sequence",i,":\n")
cat("*Base composition:\n")
print(base.freq(simu[i,]))
cat("*GC content:\n")
print(GC.content(simu[i,]))
cat("*AT content:\n")
print(1-GC.content(simu[i,]))
cat("*number of stop codons:\n")
print(nstop(nuc))
cat("\n")
}
print("Here come the amino composition of simulated proteins:")
simu_prot<-read.FASTA("C:/Users/A550240/Desktop/LIU/3.2_BioInformatics/Lab2_arti2_protein.fasta",type="AA")
for(i in 1:33){
seq<-num2prot(as.character(simu_prot[[paste(arti_names[i],"_1",sep="")]]))
aa<-levels(factor(seq))
ct<-rep(0,21)
for(j in 1:21) ct[j]<-round(sum(seq==aa[j])/length(seq),2)
cat("For sequence",i,":\n")
cat("*Amino acid composition:\n")
print(aa); cat("\n")
print(ct); cat("\n")
}
print("Here come the number of stop codons in true sequences:")
count_stop_true<-0
for(i in 1:33) {
seq<-as.character(true[[names[i]]])
nuc<-num2nucleo(seq)
count_stop_true<-count_stop_true+nstop(nuc)
}
print(count_stop_true)
## labels(simu[1,])
## (a,c,g,t)-->(88,28,48,18)
simu1<-read.dna("C:/Users/A550240/Desktop/LIU/3.2_BioInformatics/Lab2_arti1_seqs.fasta",format="fasta")
simu2<-read.dna("C:/Users/A550240/Desktop/LIU/3.2_BioInformatics/Lab2_arti2_seqs.fasta",format="fasta")
true<-read.dna("C:/Users/A550240/Desktop/LIU/3.2_BioInformatics/Lab2_lizard_seqs.fasta",format="fasta")
train<-function(fasta,labels){
longseq<-NULL
if(is.null(names(fasta)))
{for(i in 1:33) longseq<-c(longseq,num2nucleo(as.character(fasta[[labels[i]]])))} else
{for(i in 1:33) longseq<-c(longseq,num2nucleo(as.character(as.list(fasta)[[labels[i]]])))}
return(markovchainFit(longseq))
}
print("Here comes the fitted Markov-Chain model for the first simulated sequences:")
longseq<-NULL
for(i in 1:33) longseq<-c(longseq,num2nucleo(as.character(simu1[[arti_names[i]]])))
markovchainFit(longseq)$estimate
print("Here comes the fitted Markov-Chain model for the second simulated sequences:")
longseq<-NULL
for(i in 1:33) longseq<-c(longseq,num2nucleo(as.character(as.list(simu2)[[arti_names[i]]])))
markovchainFit(longseq)$estimate
print("Here comes the fitted Markov-Chain model for the true sequences:")
longseq<-NULL
for(i in 1:33) longseq<-c(longseq,num2nucleo(as.character(true[[names[i]]])))
markovchainFit(longseq)$estimate
dis<-function(file){
align0<-msa(file,type="dna")
align1<- msaConvert(align0, type="seqinr::alignment") ## convert object
dis_seq<-dist.alignment(align1, "similarity")
return (list(align0=align0,dis_seq=dis_seq))}
hm<-function(dis_seq,data){
set.seed(1234)
ord_seq<-get_order(seriate(dis_seq,method="HC"))
dist<-as.matrix(dis_seq)
plot_ly(x=names[ord_seq],y=names[ord_seq],z=dist[ord_seq,ord_seq],
type="heatmap",colors =colorRamp(c("yellow", "red"))) %>%
layout(title=paste("Heatmap of",data),
xaxis=list(title="Sites"),
yaxis=list(title="Sequences")) }
dis_arti1<-dis("C:/Users/A550240/Desktop/LIU/3.2_BioInformatics/Lab2_arti1_seqs.fasta")
hm(dis_arti1$dis_seq,"Simulations 1")
dis_arti2<-dis("C:/Users/A550240/Desktop/LIU/3.2_BioInformatics/Lab2_arti2_seqs.fasta")
hm(dis_arti2$dis_seq,"Simulations 2")
dis_true<-dis("C:/Users/A550240/Desktop/LIU/3.2_BioInformatics/Lab2_lizard_seqs.fasta")
hm(dis_true$dis_seq,"True sequence")
tr1<-nj(dis_arti1$dis_seq)
tr1<-makeLabel(tr1, space="")
tr1_boot<-boot.phylo(tr1, as.matrix(dis_arti1$dis_seq), function(e) nj(as.dist(e)),1e3)
plot.phylo(tr1) # type="u" plot the unrooted phylogenetic tree
nodelabels(tr1_boot,cex=0.7) # plot the bootstrap values
tr1$node.label <- tr1_boot # make the bootstrap values be the node labels
title("A Simple NJ Tree for Simulations 1")
tr2<-nj(dis_arti2$dis_seq)
tr2<-makeLabel(tr2, space="")
tr2_boot<-boot.phylo(tr2, as.matrix(dis_arti2$dis_seq), function(e) nj(as.dist(e)),1e3)
plot.phylo(tr2) # type="u" plot the unrooted phylogenetic tree
nodelabels(tr2_boot,cex=0.7) # plot the bootstrap values
tr2$node.label <- tr2_boot # make the bootstrap values be the node labels
title("A Simple NJ Tree for Simulations 2")
tr<-nj(dis_true$dis_seq)
tr<-makeLabel(tr, space="")
tr_boot<-boot.phylo(tr, as.matrix(dis_true$dis_seq), function(e) nj(as.dist(e)),1e3)
plot.phylo(tr) # type="u" plot the unrooted phylogenetic tree
nodelabels(tr_boot,cex=0.7) # plot the bootstrap values
tr$node.label <- tr_boot # make the bootstrap values be the node labels
title("A Simple NJ Tree for True sequence")